#setwd('/afs/inf.ed.ac.uk/user/s17/s1725186/Documents/PhD-Models/FirstPUModel/RMarkdowns')
library(tidyverse) ; library(reshape2) ; library(glue) ; library(plotly) ; library(dendextend)
library(RColorBrewer) ; library(viridis) ; require(gridExtra) ; library(GGally)
library(WGCNA)
## ==========================================================================
## *
## * Package WGCNA 1.66 loaded.
## *
## * Important note: It appears that your system supports multi-threading,
## * but it is not enabled within WGCNA in R.
## * To allow multi-threading within WGCNA with all available cores, use
## *
## * allowWGCNAThreads()
## *
## * within R. Use disableWGCNAThreads() to disable threading if necessary.
## * Alternatively, set the following environment variable on your system:
## *
## * ALLOW_WGCNA_THREADS=<number_of_processors>
## *
## * for example
## *
## * ALLOW_WGCNA_THREADS=4
## *
## * To set the environment variable in linux bash shell, type
## *
## * export ALLOW_WGCNA_THREADS=4
## *
## * before running R. Other operating systems or shells will
## * have a similar command to achieve the same aim.
## *
## ==========================================================================
library(expss)
library(polycor)
library(knitr)
SFARI_colour_hue = function(r) {
pal = c('#FF7631','#FFB100','#E8E328','#8CC83F','#62CCA6','#59B9C9','#b3b3b3','#808080','gray','#d9d9d9')[r]
}
clustering_selected = 'DynamicHybridMergedSmall'
print(paste0('Using clustering ', clustering_selected))
## [1] "Using clustering DynamicHybridMergedSmall"
Load preprocessed dataset (preprocessing code in 19_10_14_data_preprocessing.Rmd) and clustering (pipeline in 19_10_15_WGCNA.Rmd)
# Gandal dataset
load('./../Data/Gandal/preprocessed_data.RData')
datExpr = datExpr %>% data.frame
DE_info = DE_info %>% data.frame
# GO Neuronal annotations
GO_annotations = read.csv('./../../../PhD-InitialExperiments/FirstYearReview/Data/GO_annotations/genes_GO_annotations.csv')
GO_neuronal = GO_annotations %>% filter(grepl('neuron', go_term)) %>%
mutate('ID'=as.character(ensembl_gene_id)) %>%
dplyr::select(-ensembl_gene_id) %>% distinct(ID) %>%
mutate('Neuronal'=1)
# SFARI Genes
SFARI_genes = read_csv('./../Data/SFARI/SFARI_genes_08-29-2019_with_ensembl_IDs.csv')
SFARI_genes = SFARI_genes[!duplicated(SFARI_genes$ID) & !is.na(SFARI_genes$ID),]
# Update DE_info with SFARI and Neuronal information
genes_info = DE_info %>% mutate('ID'=rownames(.)) %>% left_join(SFARI_genes, by='ID') %>%
mutate(`gene-score`=ifelse(is.na(`gene-score`), 'None', `gene-score`)) %>%
left_join(GO_neuronal, by='ID') %>%
mutate(Neuronal=ifelse(is.na(Neuronal), 0, Neuronal)) %>%
mutate(gene.score=ifelse(`gene-score`=='None' & Neuronal==1, 'Neuronal', `gene-score`),
significant=padj<0.05 & !is.na(padj))
# Dataset created with DynamicTreeMerged algorithm
dataset = read.csv(paste0('./../Data/Gandal/dataset_', clustering_selected, '.csv'))
dataset$Module = dataset[,clustering_selected]
rm(DE_info, GO_annotations, clusterings)
print(paste0('There are ', length(unique(dataset$Module)),' modules:'))
## [1] "There are 36 modules:"
dataset$Module %>% table %>% sort(decreasing=TRUE) %>% print
## .
## #53B400 #00BD64 #C17FFF #F17E4F #E76BF3 #F365E6 #C49A00 #FD6F86 #8195FF
## 1930 1555 1339 1253 1149 1086 977 942 856
## #00C0BB #4B9FFF #00C094 #00BB45 #00B70C #00BF7D #D29300 #FF61C5 gray
## 738 575 566 414 400 327 314 247 178
## #E88523 #F8766D #00A8FF #FF64B2 #75AF00 #8EAB00 #A58AFF #A3A500 #00BECD
## 176 160 131 128 125 120 114 99 97
## #D774FD #00B0F6 #00B6EB #00BBDD #FF699D #B4A000 #DE8C00 #FB61D7 #00C1A8
## 97 95 94 82 72 45 45 42 32
datTraits = datMeta %>% dplyr::select(Diagnosis_, Region, Sex, Age, PMI, RNAExtractionBatch) %>%
rename('Diagnosis' = Diagnosis_, 'ExtractionBatch' = RNAExtractionBatch)
# Recalculate MEs with color labels
ME_object = datExpr %>% t %>% moduleEigengenes(colors = dataset$Module)
MEs = orderMEs(ME_object$eigengenes)
# Calculate correlation between eigengenes and the traits and their p-values
moduleTraitCor = MEs %>% apply(2, function(x) hetcor(x, datTraits, ML=TRUE)$correlations[1,-1]) %>% t
rownames(moduleTraitCor) = colnames(MEs)
colnames(moduleTraitCor) = colnames(datTraits)
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nrow(datExpr))
# In case there are any NAs
if(sum(!complete.cases(moduleTraitCor))>0){
print(paste0(sum(is.na(moduleTraitCor)),' correlation(s) could not be calculated'))
}
## [1] "1 correlation(s) could not be calculated"
moduleTraitCor = moduleTraitCor[complete.cases(moduleTraitCor),]
moduleTraitPvalue = moduleTraitPvalue[complete.cases(moduleTraitCor),]
# Sort moduleTraitCor by Diagnosis
moduleTraitCor = moduleTraitCor[order(moduleTraitCor[,1], decreasing=TRUE),]
moduleTraitPvalue = moduleTraitPvalue[order(moduleTraitCor[,1], decreasing=TRUE),]
# Create text matrix for the Heatmap
textMatrix = paste0(signif(moduleTraitCor, 2), ' (', signif(moduleTraitPvalue, 1), ')')
dim(textMatrix) = dim(moduleTraitCor)
labeledHeatmap(Matrix = moduleTraitCor, xLabels = names(datTraits), yLabels = gsub('ME','',rownames(moduleTraitCor)),
yColorWidth=0, colors = brewer.pal(11,'PiYG'), bg.lab.y = gsub('ME','',rownames(moduleTraitCor)),
textMatrix = textMatrix, setStdMargins = FALSE, cex.text = 0.8, cex.lab.y = 0.75, zlim = c(-1,1),
main = paste('Module-Trait relationships'))
rm(ME_object, datTraits, MEs, moduleTraitCor, moduleTraitPvalue, textMatrix)
Module-Diagnosis correlation and Gene Significance for the genes within each module make sense
plot_data = data.frame(MTcor=unique(dataset$MTcor)) %>% arrange(by=MTcor) %>% mutate(order=1:length(unique(dataset$MTcor))) %>%
left_join(dataset, by='MTcor') %>% filter(!is.na(MTcor)) %>% dplyr::select(ID, MTcor, GS, Module, order)
ggplotly(plot_data %>% ggplot(aes(order, GS, group=order)) + geom_hline(yintercept=0, color='gray', linetype='dotted') +
geom_boxplot(fill=unique(plot_data$Module)) + theme_minimal() +
xlab('Modules ordered by Module-Diagnosis correlation') + ylab('Gene Significance'))
ggplotly(plot_data %>% ggplot(aes(MTcor, GS)) + geom_hline(yintercept=0, color='gray', linetype='dotted') +
geom_point(color=plot_data$Module, alpha=0.1, aes(id=ID)) + geom_smooth(color='gray', alpha=0.3) +
theme_minimal() + xlab('Module-Diagnosis correlation') + ylab('Gene Significance'))
Module-Diagnosis correlation and log2FoldChange for the genes within each module make sense
plot_data = plot_data %>% left_join(genes_info, by='ID') %>% select(ID, order, Module, MTcor, log2FoldChange, baseMean)
ggplotly(plot_data %>% ggplot(aes(order, log2FoldChange, group=order)) + geom_hline(yintercept=0, color='gray', linetype='dotted') +
geom_boxplot(fill=unique(plot_data$Module)) +
theme_minimal() + xlab('Modules ordered by Module-Diagnosis correlation') + ylab('log2FoldChange'))
ggplotly(plot_data %>% ggplot(aes(MTcor, log2FoldChange)) + geom_hline(yintercept=0, color='gray', linetype='dotted') +
geom_point(color=plot_data$Module, alpha=0.1, aes(id=ID)) + geom_smooth(color='gray', alpha=0.3) +
theme_minimal() + xlab('Module-Diagnosis correlation') + ylab('log2FoldChange'))
Modules with negative module-trait correlations seem to have higher mean expression than modules with a positive relation
ggplotly(plot_data %>% ggplot(aes(order, log2(baseMean+1), group=order)) +
geom_hline(yintercept=mean(log2(plot_data$baseMean+1)), color='gray', linetype='dotted') +
geom_boxplot(fill=unique(plot_data$Module)) + theme_minimal() +
xlab('Modules ordered by Module-Diagnosis correlation') + ylab('log2(Mean Expression)'))
ggplotly(plot_data %>% ggplot(aes(MTcor, log2(baseMean+1))) + geom_point(alpha=0.2, color=plot_data$Module, aes(id=ID)) +
geom_hline(yintercept=mean(log2(plot_data$baseMean+1)), color='gray', linetype='dotted') +
geom_smooth(color='gray', alpha=0.3) + theme_minimal() + xlab('Module-Diagnosis correlation'))
There doesn’t seem to be a positive relation between Module-Diagonsis (absolute) correlation and the percentage of SFARI genes in the module (I hoped there would be)
plot_data = dataset %>% mutate('hasSFARIscore' = gene.score!='None') %>%
group_by(Module, MTcor, hasSFARIscore) %>% summarise(p=n()) %>%
left_join(dataset %>% group_by(Module) %>% summarise(n=n()), by='Module') %>%
mutate(p=round(p/n*100,2))
for(i in 1:nrow(plot_data)){
this_row = plot_data[i,]
if(this_row$hasSFARIscore==FALSE & this_row$p==100){
new_row = this_row
new_row$hasSFARIscore = TRUE
new_row$p = 0
plot_data = plot_data %>% rbind(new_row)
}
}
plot_data = plot_data %>% filter(hasSFARIscore==TRUE)
ggplotly(plot_data %>% ggplot(aes(MTcor, p, size=n)) + geom_smooth(color='gray', alpha=0.1) +
geom_point(color=plot_data$Module, aes(id=Module)) +
xlab('Module-Diagnosis correlation') + ylab('% of SFARI genes') +
theme_minimal() + theme(legend.position = 'none'))
rm(this_row, i)
There doesn’t seem to be a relation between a module’s mean expression and the percentage of SFARI genes it contains (I would have thought there would be)
aux_data = data.frame('Module'=dataset$Module, 'meanExpr'=rowMeans(datExpr)) %>% group_by(Module) %>%
summarise(meanExpr=mean(meanExpr))
plot_data = plot_data %>% left_join(aux_data, by='Module')
ggplotly(plot_data %>% ggplot(aes(meanExpr, p, size=n)) + geom_smooth(color='gray', alpha=0.1) +
geom_point(color=plot_data$Module, aes(id=Module)) +
theme_minimal() + theme(legend.position = 'none'))
An analysis of the genes belonging to the three modules with the highest Module-Diagonsis correlation can be found in PhD-Models/FirstPUModel/RMarkdowns/19_10_16_dataset_EA.html